{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import argparse\n",
    "import os\n",
    "import random\n",
    "from random import sample\n",
    "\n",
    "import pandas as pd\n",
    "from tqdm import tqdm"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_train_click = pd.read_csv('./tcdata/train_click_log.csv')\n",
    "df_test_click = pd.read_csv('./tcdata/testB_click_log_Test_B.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>article_id</th>\n",
       "      <th>category_id</th>\n",
       "      <th>created_at_ts</th>\n",
       "      <th>words_count</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1513144419000</td>\n",
       "      <td>168</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1405341936000</td>\n",
       "      <td>189</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>1408667706000</td>\n",
       "      <td>250</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>3</td>\n",
       "      <td>1</td>\n",
       "      <td>1408468313000</td>\n",
       "      <td>230</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>4</td>\n",
       "      <td>1</td>\n",
       "      <td>1407071171000</td>\n",
       "      <td>162</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>364042</th>\n",
       "      <td>364042</td>\n",
       "      <td>460</td>\n",
       "      <td>1434034118000</td>\n",
       "      <td>144</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>364043</th>\n",
       "      <td>364043</td>\n",
       "      <td>460</td>\n",
       "      <td>1434148472000</td>\n",
       "      <td>463</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>364044</th>\n",
       "      <td>364044</td>\n",
       "      <td>460</td>\n",
       "      <td>1457974279000</td>\n",
       "      <td>177</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>364045</th>\n",
       "      <td>364045</td>\n",
       "      <td>460</td>\n",
       "      <td>1515964737000</td>\n",
       "      <td>126</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>364046</th>\n",
       "      <td>364046</td>\n",
       "      <td>460</td>\n",
       "      <td>1505811330000</td>\n",
       "      <td>479</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>364047 rows × 4 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "        article_id  category_id  created_at_ts  words_count\n",
       "0                0            0  1513144419000          168\n",
       "1                1            1  1405341936000          189\n",
       "2                2            1  1408667706000          250\n",
       "3                3            1  1408468313000          230\n",
       "4                4            1  1407071171000          162\n",
       "...            ...          ...            ...          ...\n",
       "364042      364042          460  1434034118000          144\n",
       "364043      364043          460  1434148472000          463\n",
       "364044      364044          460  1457974279000          177\n",
       "364045      364045          460  1515964737000          126\n",
       "364046      364046          460  1505811330000          479\n",
       "\n",
       "[364047 rows x 4 columns]"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "articles = pd.read_csv('./tcdata/articles.csv')\n",
    "articles"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>article_id</th>\n",
       "      <th>emb_0</th>\n",
       "      <th>emb_1</th>\n",
       "      <th>emb_2</th>\n",
       "      <th>emb_3</th>\n",
       "      <th>emb_4</th>\n",
       "      <th>emb_5</th>\n",
       "      <th>emb_6</th>\n",
       "      <th>emb_7</th>\n",
       "      <th>emb_8</th>\n",
       "      <th>...</th>\n",
       "      <th>emb_240</th>\n",
       "      <th>emb_241</th>\n",
       "      <th>emb_242</th>\n",
       "      <th>emb_243</th>\n",
       "      <th>emb_244</th>\n",
       "      <th>emb_245</th>\n",
       "      <th>emb_246</th>\n",
       "      <th>emb_247</th>\n",
       "      <th>emb_248</th>\n",
       "      <th>emb_249</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0</td>\n",
       "      <td>-0.161183</td>\n",
       "      <td>-0.957233</td>\n",
       "      <td>-0.137944</td>\n",
       "      <td>0.050855</td>\n",
       "      <td>0.830055</td>\n",
       "      <td>0.901365</td>\n",
       "      <td>-0.335148</td>\n",
       "      <td>-0.559561</td>\n",
       "      <td>-0.500603</td>\n",
       "      <td>...</td>\n",
       "      <td>0.321248</td>\n",
       "      <td>0.313999</td>\n",
       "      <td>0.636412</td>\n",
       "      <td>0.169179</td>\n",
       "      <td>0.540524</td>\n",
       "      <td>-0.813182</td>\n",
       "      <td>0.286870</td>\n",
       "      <td>-0.231686</td>\n",
       "      <td>0.597416</td>\n",
       "      <td>0.409623</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1</td>\n",
       "      <td>-0.523216</td>\n",
       "      <td>-0.974058</td>\n",
       "      <td>0.738608</td>\n",
       "      <td>0.155234</td>\n",
       "      <td>0.626294</td>\n",
       "      <td>0.485297</td>\n",
       "      <td>-0.715657</td>\n",
       "      <td>-0.897996</td>\n",
       "      <td>-0.359747</td>\n",
       "      <td>...</td>\n",
       "      <td>-0.487843</td>\n",
       "      <td>0.823124</td>\n",
       "      <td>0.412688</td>\n",
       "      <td>-0.338654</td>\n",
       "      <td>0.320786</td>\n",
       "      <td>0.588643</td>\n",
       "      <td>-0.594137</td>\n",
       "      <td>0.182828</td>\n",
       "      <td>0.397090</td>\n",
       "      <td>-0.834364</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>2</td>\n",
       "      <td>-0.619619</td>\n",
       "      <td>-0.972960</td>\n",
       "      <td>-0.207360</td>\n",
       "      <td>-0.128861</td>\n",
       "      <td>0.044748</td>\n",
       "      <td>-0.387535</td>\n",
       "      <td>-0.730477</td>\n",
       "      <td>-0.066126</td>\n",
       "      <td>-0.754899</td>\n",
       "      <td>...</td>\n",
       "      <td>0.454756</td>\n",
       "      <td>0.473184</td>\n",
       "      <td>0.377866</td>\n",
       "      <td>-0.863887</td>\n",
       "      <td>-0.383365</td>\n",
       "      <td>0.137721</td>\n",
       "      <td>-0.810877</td>\n",
       "      <td>-0.447580</td>\n",
       "      <td>0.805932</td>\n",
       "      <td>-0.285284</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>3</td>\n",
       "      <td>-0.740843</td>\n",
       "      <td>-0.975749</td>\n",
       "      <td>0.391698</td>\n",
       "      <td>0.641738</td>\n",
       "      <td>-0.268645</td>\n",
       "      <td>0.191745</td>\n",
       "      <td>-0.825593</td>\n",
       "      <td>-0.710591</td>\n",
       "      <td>-0.040099</td>\n",
       "      <td>...</td>\n",
       "      <td>0.271535</td>\n",
       "      <td>0.036040</td>\n",
       "      <td>0.480029</td>\n",
       "      <td>-0.763173</td>\n",
       "      <td>0.022627</td>\n",
       "      <td>0.565165</td>\n",
       "      <td>-0.910286</td>\n",
       "      <td>-0.537838</td>\n",
       "      <td>0.243541</td>\n",
       "      <td>-0.885329</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>4</td>\n",
       "      <td>-0.279052</td>\n",
       "      <td>-0.972315</td>\n",
       "      <td>0.685374</td>\n",
       "      <td>0.113056</td>\n",
       "      <td>0.238315</td>\n",
       "      <td>0.271913</td>\n",
       "      <td>-0.568816</td>\n",
       "      <td>0.341194</td>\n",
       "      <td>-0.600554</td>\n",
       "      <td>...</td>\n",
       "      <td>0.238286</td>\n",
       "      <td>0.809268</td>\n",
       "      <td>0.427521</td>\n",
       "      <td>-0.615932</td>\n",
       "      <td>-0.503697</td>\n",
       "      <td>0.614450</td>\n",
       "      <td>-0.917760</td>\n",
       "      <td>-0.424061</td>\n",
       "      <td>0.185484</td>\n",
       "      <td>-0.580292</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>364042</th>\n",
       "      <td>364042</td>\n",
       "      <td>-0.055038</td>\n",
       "      <td>-0.962136</td>\n",
       "      <td>0.869436</td>\n",
       "      <td>-0.071523</td>\n",
       "      <td>-0.725294</td>\n",
       "      <td>0.434320</td>\n",
       "      <td>0.198312</td>\n",
       "      <td>-0.581154</td>\n",
       "      <td>0.702346</td>\n",
       "      <td>...</td>\n",
       "      <td>-0.410550</td>\n",
       "      <td>0.564252</td>\n",
       "      <td>-0.463959</td>\n",
       "      <td>0.167907</td>\n",
       "      <td>-0.480068</td>\n",
       "      <td>0.652090</td>\n",
       "      <td>0.380880</td>\n",
       "      <td>0.433195</td>\n",
       "      <td>-0.662455</td>\n",
       "      <td>-0.222850</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>364043</th>\n",
       "      <td>364043</td>\n",
       "      <td>-0.136932</td>\n",
       "      <td>-0.995472</td>\n",
       "      <td>0.991298</td>\n",
       "      <td>0.031871</td>\n",
       "      <td>-0.915621</td>\n",
       "      <td>-0.658517</td>\n",
       "      <td>0.633090</td>\n",
       "      <td>-0.564356</td>\n",
       "      <td>0.676551</td>\n",
       "      <td>...</td>\n",
       "      <td>-0.681986</td>\n",
       "      <td>-0.574185</td>\n",
       "      <td>-0.536908</td>\n",
       "      <td>0.688934</td>\n",
       "      <td>0.528204</td>\n",
       "      <td>0.162435</td>\n",
       "      <td>0.940364</td>\n",
       "      <td>0.989298</td>\n",
       "      <td>-0.761595</td>\n",
       "      <td>-0.414652</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>364044</th>\n",
       "      <td>364044</td>\n",
       "      <td>-0.251390</td>\n",
       "      <td>-0.976243</td>\n",
       "      <td>0.586097</td>\n",
       "      <td>0.643631</td>\n",
       "      <td>-0.663359</td>\n",
       "      <td>-0.093480</td>\n",
       "      <td>0.691553</td>\n",
       "      <td>-0.588281</td>\n",
       "      <td>0.902999</td>\n",
       "      <td>...</td>\n",
       "      <td>-0.162220</td>\n",
       "      <td>-0.242030</td>\n",
       "      <td>-0.476131</td>\n",
       "      <td>0.352132</td>\n",
       "      <td>-0.311279</td>\n",
       "      <td>0.460574</td>\n",
       "      <td>-0.653077</td>\n",
       "      <td>-0.143725</td>\n",
       "      <td>0.068093</td>\n",
       "      <td>-0.705010</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>364045</th>\n",
       "      <td>364045</td>\n",
       "      <td>0.224342</td>\n",
       "      <td>-0.923288</td>\n",
       "      <td>-0.381742</td>\n",
       "      <td>0.687890</td>\n",
       "      <td>-0.773911</td>\n",
       "      <td>-0.103629</td>\n",
       "      <td>-0.406486</td>\n",
       "      <td>0.246004</td>\n",
       "      <td>0.255191</td>\n",
       "      <td>...</td>\n",
       "      <td>-0.422999</td>\n",
       "      <td>0.390324</td>\n",
       "      <td>0.655911</td>\n",
       "      <td>-0.646753</td>\n",
       "      <td>-0.174031</td>\n",
       "      <td>0.698037</td>\n",
       "      <td>-0.317102</td>\n",
       "      <td>0.687132</td>\n",
       "      <td>-0.531512</td>\n",
       "      <td>0.010726</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>364046</th>\n",
       "      <td>364046</td>\n",
       "      <td>-0.257134</td>\n",
       "      <td>-0.994631</td>\n",
       "      <td>0.983792</td>\n",
       "      <td>-0.190975</td>\n",
       "      <td>-0.953720</td>\n",
       "      <td>-0.893823</td>\n",
       "      <td>0.708974</td>\n",
       "      <td>-0.557027</td>\n",
       "      <td>0.846842</td>\n",
       "      <td>...</td>\n",
       "      <td>-0.490481</td>\n",
       "      <td>-0.689666</td>\n",
       "      <td>-0.661846</td>\n",
       "      <td>0.490945</td>\n",
       "      <td>0.736526</td>\n",
       "      <td>0.667668</td>\n",
       "      <td>0.902130</td>\n",
       "      <td>0.983873</td>\n",
       "      <td>-0.838183</td>\n",
       "      <td>-0.179283</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>364047 rows × 251 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "        article_id     emb_0     emb_1     emb_2     emb_3     emb_4  \\\n",
       "0                0 -0.161183 -0.957233 -0.137944  0.050855  0.830055   \n",
       "1                1 -0.523216 -0.974058  0.738608  0.155234  0.626294   \n",
       "2                2 -0.619619 -0.972960 -0.207360 -0.128861  0.044748   \n",
       "3                3 -0.740843 -0.975749  0.391698  0.641738 -0.268645   \n",
       "4                4 -0.279052 -0.972315  0.685374  0.113056  0.238315   \n",
       "...            ...       ...       ...       ...       ...       ...   \n",
       "364042      364042 -0.055038 -0.962136  0.869436 -0.071523 -0.725294   \n",
       "364043      364043 -0.136932 -0.995472  0.991298  0.031871 -0.915621   \n",
       "364044      364044 -0.251390 -0.976243  0.586097  0.643631 -0.663359   \n",
       "364045      364045  0.224342 -0.923288 -0.381742  0.687890 -0.773911   \n",
       "364046      364046 -0.257134 -0.994631  0.983792 -0.190975 -0.953720   \n",
       "\n",
       "           emb_5     emb_6     emb_7     emb_8  ...   emb_240   emb_241  \\\n",
       "0       0.901365 -0.335148 -0.559561 -0.500603  ...  0.321248  0.313999   \n",
       "1       0.485297 -0.715657 -0.897996 -0.359747  ... -0.487843  0.823124   \n",
       "2      -0.387535 -0.730477 -0.066126 -0.754899  ...  0.454756  0.473184   \n",
       "3       0.191745 -0.825593 -0.710591 -0.040099  ...  0.271535  0.036040   \n",
       "4       0.271913 -0.568816  0.341194 -0.600554  ...  0.238286  0.809268   \n",
       "...          ...       ...       ...       ...  ...       ...       ...   \n",
       "364042  0.434320  0.198312 -0.581154  0.702346  ... -0.410550  0.564252   \n",
       "364043 -0.658517  0.633090 -0.564356  0.676551  ... -0.681986 -0.574185   \n",
       "364044 -0.093480  0.691553 -0.588281  0.902999  ... -0.162220 -0.242030   \n",
       "364045 -0.103629 -0.406486  0.246004  0.255191  ... -0.422999  0.390324   \n",
       "364046 -0.893823  0.708974 -0.557027  0.846842  ... -0.490481 -0.689666   \n",
       "\n",
       "         emb_242   emb_243   emb_244   emb_245   emb_246   emb_247   emb_248  \\\n",
       "0       0.636412  0.169179  0.540524 -0.813182  0.286870 -0.231686  0.597416   \n",
       "1       0.412688 -0.338654  0.320786  0.588643 -0.594137  0.182828  0.397090   \n",
       "2       0.377866 -0.863887 -0.383365  0.137721 -0.810877 -0.447580  0.805932   \n",
       "3       0.480029 -0.763173  0.022627  0.565165 -0.910286 -0.537838  0.243541   \n",
       "4       0.427521 -0.615932 -0.503697  0.614450 -0.917760 -0.424061  0.185484   \n",
       "...          ...       ...       ...       ...       ...       ...       ...   \n",
       "364042 -0.463959  0.167907 -0.480068  0.652090  0.380880  0.433195 -0.662455   \n",
       "364043 -0.536908  0.688934  0.528204  0.162435  0.940364  0.989298 -0.761595   \n",
       "364044 -0.476131  0.352132 -0.311279  0.460574 -0.653077 -0.143725  0.068093   \n",
       "364045  0.655911 -0.646753 -0.174031  0.698037 -0.317102  0.687132 -0.531512   \n",
       "364046 -0.661846  0.490945  0.736526  0.667668  0.902130  0.983873 -0.838183   \n",
       "\n",
       "         emb_249  \n",
       "0       0.409623  \n",
       "1      -0.834364  \n",
       "2      -0.285284  \n",
       "3      -0.885329  \n",
       "4      -0.580292  \n",
       "...          ...  \n",
       "364042 -0.222850  \n",
       "364043 -0.414652  \n",
       "364044 -0.705010  \n",
       "364045  0.010726  \n",
       "364046 -0.179283  \n",
       "\n",
       "[364047 rows x 251 columns]"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "articles_emb = pd.read_csv('./tcdata/articles_emb.csv')\n",
    "articles_emb"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>user_id</th>\n",
       "      <th>article_1</th>\n",
       "      <th>article_2</th>\n",
       "      <th>article_3</th>\n",
       "      <th>article_4</th>\n",
       "      <th>article_5</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>200000</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>3</td>\n",
       "      <td>4</td>\n",
       "      <td>5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>200001</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>3</td>\n",
       "      <td>4</td>\n",
       "      <td>5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>200002</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>3</td>\n",
       "      <td>4</td>\n",
       "      <td>5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>200003</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>3</td>\n",
       "      <td>4</td>\n",
       "      <td>5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>200004</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>3</td>\n",
       "      <td>4</td>\n",
       "      <td>5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>49995</th>\n",
       "      <td>249995</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>3</td>\n",
       "      <td>4</td>\n",
       "      <td>5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>49996</th>\n",
       "      <td>249996</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>3</td>\n",
       "      <td>4</td>\n",
       "      <td>5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>49997</th>\n",
       "      <td>249997</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>3</td>\n",
       "      <td>4</td>\n",
       "      <td>5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>49998</th>\n",
       "      <td>249998</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>3</td>\n",
       "      <td>4</td>\n",
       "      <td>5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>49999</th>\n",
       "      <td>249999</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>3</td>\n",
       "      <td>4</td>\n",
       "      <td>5</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>50000 rows × 6 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "       user_id  article_1  article_2  article_3  article_4  article_5\n",
       "0       200000          1          2          3          4          5\n",
       "1       200001          1          2          3          4          5\n",
       "2       200002          1          2          3          4          5\n",
       "3       200003          1          2          3          4          5\n",
       "4       200004          1          2          3          4          5\n",
       "...        ...        ...        ...        ...        ...        ...\n",
       "49995   249995          1          2          3          4          5\n",
       "49996   249996          1          2          3          4          5\n",
       "49997   249997          1          2          3          4          5\n",
       "49998   249998          1          2          3          4          5\n",
       "49999   249999          1          2          3          4          5\n",
       "\n",
       "[50000 rows x 6 columns]"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "sample_submit = pd.read_csv('./tcdata/sample_submit.csv')\n",
    "sample_submit"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>user_id</th>\n",
       "      <th>click_article_id</th>\n",
       "      <th>click_timestamp</th>\n",
       "      <th>click_environment</th>\n",
       "      <th>click_deviceGroup</th>\n",
       "      <th>click_os</th>\n",
       "      <th>click_country</th>\n",
       "      <th>click_region</th>\n",
       "      <th>click_referrer_type</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>249999</td>\n",
       "      <td>160974</td>\n",
       "      <td>1506959142820</td>\n",
       "      <td>4</td>\n",
       "      <td>1</td>\n",
       "      <td>17</td>\n",
       "      <td>1</td>\n",
       "      <td>13</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>249999</td>\n",
       "      <td>160417</td>\n",
       "      <td>1506959172820</td>\n",
       "      <td>4</td>\n",
       "      <td>1</td>\n",
       "      <td>17</td>\n",
       "      <td>1</td>\n",
       "      <td>13</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>249998</td>\n",
       "      <td>160974</td>\n",
       "      <td>1506959056066</td>\n",
       "      <td>4</td>\n",
       "      <td>1</td>\n",
       "      <td>12</td>\n",
       "      <td>1</td>\n",
       "      <td>13</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>249998</td>\n",
       "      <td>202557</td>\n",
       "      <td>1506959086066</td>\n",
       "      <td>4</td>\n",
       "      <td>1</td>\n",
       "      <td>12</td>\n",
       "      <td>1</td>\n",
       "      <td>13</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>249997</td>\n",
       "      <td>183665</td>\n",
       "      <td>1506959088613</td>\n",
       "      <td>4</td>\n",
       "      <td>1</td>\n",
       "      <td>17</td>\n",
       "      <td>1</td>\n",
       "      <td>15</td>\n",
       "      <td>5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>518005</th>\n",
       "      <td>221924</td>\n",
       "      <td>70758</td>\n",
       "      <td>1508211323220</td>\n",
       "      <td>4</td>\n",
       "      <td>3</td>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>25</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>518006</th>\n",
       "      <td>207823</td>\n",
       "      <td>331116</td>\n",
       "      <td>1508211542618</td>\n",
       "      <td>4</td>\n",
       "      <td>3</td>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>25</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>518007</th>\n",
       "      <td>207823</td>\n",
       "      <td>234481</td>\n",
       "      <td>1508211850103</td>\n",
       "      <td>4</td>\n",
       "      <td>3</td>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>25</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>518008</th>\n",
       "      <td>207823</td>\n",
       "      <td>211442</td>\n",
       "      <td>1508212189949</td>\n",
       "      <td>4</td>\n",
       "      <td>3</td>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>25</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>518009</th>\n",
       "      <td>207823</td>\n",
       "      <td>211401</td>\n",
       "      <td>1508212315718</td>\n",
       "      <td>4</td>\n",
       "      <td>3</td>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>25</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>518010 rows × 9 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "        user_id  click_article_id  click_timestamp  click_environment  \\\n",
       "0        249999            160974    1506959142820                  4   \n",
       "1        249999            160417    1506959172820                  4   \n",
       "2        249998            160974    1506959056066                  4   \n",
       "3        249998            202557    1506959086066                  4   \n",
       "4        249997            183665    1506959088613                  4   \n",
       "...         ...               ...              ...                ...   \n",
       "518005   221924             70758    1508211323220                  4   \n",
       "518006   207823            331116    1508211542618                  4   \n",
       "518007   207823            234481    1508211850103                  4   \n",
       "518008   207823            211442    1508212189949                  4   \n",
       "518009   207823            211401    1508212315718                  4   \n",
       "\n",
       "        click_deviceGroup  click_os  click_country  click_region  \\\n",
       "0                       1        17              1            13   \n",
       "1                       1        17              1            13   \n",
       "2                       1        12              1            13   \n",
       "3                       1        12              1            13   \n",
       "4                       1        17              1            15   \n",
       "...                   ...       ...            ...           ...   \n",
       "518005                  3         2              1            25   \n",
       "518006                  3         2              1            25   \n",
       "518007                  3         2              1            25   \n",
       "518008                  3         2              1            25   \n",
       "518009                  3         2              1            25   \n",
       "\n",
       "        click_referrer_type  \n",
       "0                         2  \n",
       "1                         2  \n",
       "2                         2  \n",
       "3                         2  \n",
       "4                         5  \n",
       "...                     ...  \n",
       "518005                    2  \n",
       "518006                    1  \n",
       "518007                    1  \n",
       "518008                    1  \n",
       "518009                    1  \n",
       "\n",
       "[518010 rows x 9 columns]"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_test_click"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>user_id</th>\n",
       "      <th>click_article_id</th>\n",
       "      <th>click_timestamp</th>\n",
       "      <th>click_environment</th>\n",
       "      <th>click_deviceGroup</th>\n",
       "      <th>click_os</th>\n",
       "      <th>click_country</th>\n",
       "      <th>click_region</th>\n",
       "      <th>click_referrer_type</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>199999</td>\n",
       "      <td>160417</td>\n",
       "      <td>1507029570190</td>\n",
       "      <td>4</td>\n",
       "      <td>1</td>\n",
       "      <td>17</td>\n",
       "      <td>1</td>\n",
       "      <td>13</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>199999</td>\n",
       "      <td>5408</td>\n",
       "      <td>1507029571478</td>\n",
       "      <td>4</td>\n",
       "      <td>1</td>\n",
       "      <td>17</td>\n",
       "      <td>1</td>\n",
       "      <td>13</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>199999</td>\n",
       "      <td>50823</td>\n",
       "      <td>1507029601478</td>\n",
       "      <td>4</td>\n",
       "      <td>1</td>\n",
       "      <td>17</td>\n",
       "      <td>1</td>\n",
       "      <td>13</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>199998</td>\n",
       "      <td>157770</td>\n",
       "      <td>1507029532200</td>\n",
       "      <td>4</td>\n",
       "      <td>1</td>\n",
       "      <td>17</td>\n",
       "      <td>1</td>\n",
       "      <td>25</td>\n",
       "      <td>5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>199998</td>\n",
       "      <td>96613</td>\n",
       "      <td>1507029671831</td>\n",
       "      <td>4</td>\n",
       "      <td>1</td>\n",
       "      <td>17</td>\n",
       "      <td>1</td>\n",
       "      <td>25</td>\n",
       "      <td>5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1112618</th>\n",
       "      <td>33446</td>\n",
       "      <td>16346</td>\n",
       "      <td>1508212054157</td>\n",
       "      <td>4</td>\n",
       "      <td>3</td>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>25</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1112619</th>\n",
       "      <td>0</td>\n",
       "      <td>30760</td>\n",
       "      <td>1508211672520</td>\n",
       "      <td>4</td>\n",
       "      <td>1</td>\n",
       "      <td>17</td>\n",
       "      <td>1</td>\n",
       "      <td>25</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1112620</th>\n",
       "      <td>0</td>\n",
       "      <td>157507</td>\n",
       "      <td>1508211702520</td>\n",
       "      <td>4</td>\n",
       "      <td>1</td>\n",
       "      <td>17</td>\n",
       "      <td>1</td>\n",
       "      <td>25</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1112621</th>\n",
       "      <td>199178</td>\n",
       "      <td>234481</td>\n",
       "      <td>1508211513583</td>\n",
       "      <td>4</td>\n",
       "      <td>3</td>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>25</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1112622</th>\n",
       "      <td>199178</td>\n",
       "      <td>233578</td>\n",
       "      <td>1508211543583</td>\n",
       "      <td>4</td>\n",
       "      <td>3</td>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>25</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>1112623 rows × 9 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "         user_id  click_article_id  click_timestamp  click_environment  \\\n",
       "0         199999            160417    1507029570190                  4   \n",
       "1         199999              5408    1507029571478                  4   \n",
       "2         199999             50823    1507029601478                  4   \n",
       "3         199998            157770    1507029532200                  4   \n",
       "4         199998             96613    1507029671831                  4   \n",
       "...          ...               ...              ...                ...   \n",
       "1112618    33446             16346    1508212054157                  4   \n",
       "1112619        0             30760    1508211672520                  4   \n",
       "1112620        0            157507    1508211702520                  4   \n",
       "1112621   199178            234481    1508211513583                  4   \n",
       "1112622   199178            233578    1508211543583                  4   \n",
       "\n",
       "         click_deviceGroup  click_os  click_country  click_region  \\\n",
       "0                        1        17              1            13   \n",
       "1                        1        17              1            13   \n",
       "2                        1        17              1            13   \n",
       "3                        1        17              1            25   \n",
       "4                        1        17              1            25   \n",
       "...                    ...       ...            ...           ...   \n",
       "1112618                  3         2              1            25   \n",
       "1112619                  1        17              1            25   \n",
       "1112620                  1        17              1            25   \n",
       "1112621                  3         2              1            25   \n",
       "1112622                  3         2              1            25   \n",
       "\n",
       "         click_referrer_type  \n",
       "0                          1  \n",
       "1                          1  \n",
       "2                          1  \n",
       "3                          5  \n",
       "4                          5  \n",
       "...                      ...  \n",
       "1112618                    1  \n",
       "1112619                    2  \n",
       "1112620                    2  \n",
       "1112621                    2  \n",
       "1112622                    2  \n",
       "\n",
       "[1112623 rows x 9 columns]"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_train_click"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_users = df_train_click['user_id'].values.tolist()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "val_users = sample(train_users, 50000)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [],
   "source": [
    "click_list = []\n",
    "valid_query_list = []"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {},
   "outputs": [],
   "source": [
    "groups = df_train_click.groupby(['user_id'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|█████████████████████████████████████████████████████████████████████████| 200000/200000 [09:33<00:00, 348.72it/s]\n"
     ]
    }
   ],
   "source": [
    "for user_id, g in tqdm(groups):\n",
    "    if user_id in val_users:\n",
    "        valid_query = g.tail(1)\n",
    "        valid_query_list.append(\n",
    "        valid_query[['user_id', 'click_article_id']])\n",
    "        train_click = g.head(g.shape[0] - 1)\n",
    "        click_list.append(train_click)\n",
    "    else:\n",
    "        click_list.append(g)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_train_click = pd.concat(click_list, sort=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_valid_query = pd.concat(valid_query_list, sort=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "metadata": {},
   "outputs": [],
   "source": [
    "test_users = df_test_click['user_id'].unique()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "metadata": {},
   "outputs": [],
   "source": [
    "test_query_list = []"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|███████████████████████████████████████████████████████████████████████| 50000/50000 [00:00<00:00, 1790585.80it/s]\n"
     ]
    }
   ],
   "source": [
    "for user in tqdm(test_users):\n",
    "    test_query_list.append([user, -1])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 51,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_test_query = pd.DataFrame(test_query_list,\n",
    "                    columns=['user_id', 'click_article_id'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 54,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_query = pd.concat([df_valid_query, df_test_query],\n",
    "                         sort=False).reset_index(drop=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 56,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_click = pd.concat([df_train_click, df_test_click],\n",
    "                         sort=False).reset_index(drop=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 59,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_click = df_click.sort_values(['user_id',\n",
    "                                     'click_timestamp']).reset_index(drop=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 62,
   "metadata": {},
   "outputs": [],
   "source": [
    "os.makedirs('./user_data/data/offline', exist_ok=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 63,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_click.to_pickle('./user_data/data/offline/click.pkl')\n",
    "df_query.to_pickle('./user_data/data/offline/query.pkl')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python [conda env:tf1.14]",
   "language": "python",
   "name": "conda-env-tf1.14-py"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.11"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
